library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) 
library(leaps) 
library(glmnet) 
## Loading required package: Matrix
## Loaded glmnet 3.0-2
library(pls) 
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
library(class)

Import Data

First we import the data and examine the different variables in this dataset, so that we have an idea of which variables are quantitative and which are qualitative.

#Import data
# Read the house-prices training and test sets from the working directory.
train <- read.csv('train.csv', header = TRUE) 
test <- read.csv('test.csv', header = TRUE) 
# Inspect every column's distribution and NA counts before preprocessing.
summary(train)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour  Utilities   
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63    AllPub:1459  
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50    NoSeWa:   1  
##  Median :  9478               NA's:1369   IR3: 10   Low:  36                 
##  Mean   : 10517                           Reg:925   Lvl:1311                 
##  3rd Qu.: 11602                                                              
##  Max.   :215245                                                              
##                                                                              
##    LotConfig    LandSlope   Neighborhood   Condition1     Condition2  
##  Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260   Norm   :1445  
##  CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81   Feedr  :   6  
##  FR2    :  47   Sev:  13   OldTown:113   Artery :  48   Artery :   2  
##  FR3    :   4              Edwards:100   RRAn   :  26   PosN   :   2  
##  Inside :1052              Somerst: 86   PosN   :  19   RRNn   :   2  
##                            Gilbert: 79   RRAe   :  11   PosA   :   1  
##                            (Other):707   (Other):  15   (Other):   2  
##    BldgType      HouseStyle   OverallQual      OverallCond      YearBuilt   
##  1Fam  :1220   1Story :726   Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  2fmCon:  31   2Story :445   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Duplex:  52   1.5Fin :154   Median : 6.000   Median :5.000   Median :1973  
##  Twnhs :  43   SLvl   : 65   Mean   : 6.099   Mean   :5.575   Mean   :1971  
##  TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                1.5Unf : 14   Max.   :10.000   Max.   :9.000   Max.   :2010  
##                (Other): 19                                                  
##   YearRemodAdd    RoofStyle       RoofMatl     Exterior1st   Exterior2nd 
##  Min.   :1950   Flat   :  13   CompShg:1434   VinylSd:515   VinylSd:504  
##  1st Qu.:1967   Gable  :1141   Tar&Grv:  11   HdBoard:222   MetalSd:214  
##  Median :1994   Gambrel:  11   WdShngl:   6   MetalSd:220   HdBoard:207  
##  Mean   :1985   Hip    : 286   WdShake:   5   Wd Sdng:206   Wd Sdng:197  
##  3rd Qu.:2004   Mansard:   7   ClyTile:   1   Plywood:108   Plywood:142  
##  Max.   :2010   Shed   :   2   Membran:   1   CemntBd: 61   CmentBd: 60  
##                                (Other):   2   (Other):128   (Other):136  
##    MasVnrType    MasVnrArea     ExterQual ExterCond  Foundation  BsmtQual  
##  BrkCmn : 15   Min.   :   0.0   Ex: 52    Ex:   3   BrkTil:146   Ex  :121  
##  BrkFace:445   1st Qu.:   0.0   Fa: 14    Fa:  28   CBlock:634   Fa  : 35  
##  None   :864   Median :   0.0   Gd:488    Gd: 146   PConc :647   Gd  :618  
##  Stone  :128   Mean   : 103.7   TA:906    Po:   1   Slab  : 24   TA  :649  
##  NA's   :  8   3rd Qu.: 166.0             TA:1282   Stone :  6   NA's: 37  
##                Max.   :1600.0                       Wood  :  3             
##                NA's   :8                                                   
##  BsmtCond    BsmtExposure BsmtFinType1   BsmtFinSF1     BsmtFinType2
##  Fa  :  45   Av  :221     ALQ :220     Min.   :   0.0   ALQ :  19   
##  Gd  :  65   Gd  :134     BLQ :148     1st Qu.:   0.0   BLQ :  33   
##  Po  :   2   Mn  :114     GLQ :418     Median : 383.5   GLQ :  14   
##  TA  :1311   No  :953     LwQ : 74     Mean   : 443.6   LwQ :  46   
##  NA's:  37   NA's: 38     Rec :133     3rd Qu.: 712.2   Rec :  54   
##                           Unf :430     Max.   :5644.0   Unf :1256   
##                           NA's: 37                      NA's:  38   
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741   
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49   
##  Median :   0.00   Median : 477.5   Median : 991.5   GasW :  18   Gd:241   
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1   
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428   
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0   Wall :   4            
##                                                                            
##  CentralAir Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  N:  95     FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  Y:1365     FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##             FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##             Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##             SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##             NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                         
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual  TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100      Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39      1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586      Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735      Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000               3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000               Max.   :14.000  
##                                                                               
##  Functional    Fireplaces    FireplaceQu   GarageType   GarageYrBlt  
##  Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6   Min.   :1900  
##  Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870   1st Qu.:1961  
##  Min1:  31   Median :1.000   Gd  :380    Basment: 19   Median :1980  
##  Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88   Mean   :1979  
##  Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9   3rd Qu.:2002  
##  Sev :   1   Max.   :3.000   NA's:690    Detchd :387   Max.   :2010  
##  Typ :1360                               NA's   : 81   NA's   :81    
##  GarageFinish   GarageCars      GarageArea     GarageQual  GarageCond 
##  Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3   Ex  :   2  
##  RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48   Fa  :  35  
##  Unf :605     Median :2.000   Median : 480.0   Gd  :  14   Gd  :   9  
##  NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3   Po  :   7  
##               3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311   TA  :1326  
##               Max.   :4.000   Max.   :1418.0   NA's:  81   NA's:  81  
##                                                                       
##  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Y:1340     Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##             Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##             3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##             Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                                
##   ScreenPorch        PoolArea        PoolQC       Fence      MiscFeature
##  Min.   :  0.00   Min.   :  0.000   Ex  :   2   GdPrv:  59   Gar2:   2  
##  1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2   GdWo :  54   Othr:   2  
##  Median :  0.00   Median :  0.000   Gd  :   3   MnPrv: 157   Shed:  49  
##  Mean   : 15.06   Mean   :  2.759   NA's:1453   MnWw :  11   TenC:   1  
##  3rd Qu.:  0.00   3rd Qu.:  0.000               NA's :1179   NA's:1406  
##  Max.   :480.00   Max.   :738.000                                       
##                                                                         
##     MiscVal             MoSold           YrSold        SaleType   
##  Min.   :    0.00   Min.   : 1.000   Min.   :2006   WD     :1267  
##  1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007   New    : 122  
##  Median :    0.00   Median : 6.000   Median :2008   COD    :  43  
##  Mean   :   43.49   Mean   : 6.322   Mean   :2008   ConLD  :   9  
##  3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009   ConLI  :   5  
##  Max.   :15500.00   Max.   :12.000   Max.   :2010   ConLw  :   5  
##                                                     (Other):   9  
##  SaleCondition    SalePrice     
##  Abnorml: 101   Min.   : 34900  
##  AdjLand:   4   1st Qu.:129975  
##  Alloca :  12   Median :163000  
##  Family :  20   Mean   :180921  
##  Normal :1198   3rd Qu.:214000  
##  Partial: 125   Max.   :755000  
## 
library(ggplot2)
# NOTE(review): the original called par(mfrow = c(1, 2)) here, but par() only
# affects base graphics and has no effect on ggplot2 plots, so it was removed.
# Raw SalePrice: visibly right-skewed.
ggplot(data = train, aes(x = SalePrice)) + geom_histogram(bins = 50)

# log(SalePrice + 1): roughly symmetric, motivating the log transform below.
ggplot(data = train, aes(x = log(SalePrice+1))) + geom_histogram(bins = 50)

In the next few steps, we discuss each variable in the dataset and preprocess the whole dataset, so that we have a clean dataset to feed into our models in the following sections. Note that we apply a log transformation to SalePrice (the response y), since the original values show a right-skewed distribution.

# Log-transform the response to correct the right skew observed above.
train$SalePrice <- log(train$SalePrice+1)

# save the id columns that are needed to compose the submission file 
train.id <- train$Id 
test.id <- test$Id

# add SalePrice column to test set 
# (NA tags the test rows so the combined frame can be split apart again later)
test$SalePrice <- NA

# combine the training and test sets 
# so all imputation and encoding below is applied consistently to both
combined <- rbind(train, test)

# drop id column which is unnecessary for the prediction 
combined <- combined[, -1]
# Repair an obvious data-entry typo: a garage cannot be built in year 2207.
combined$GarageYrBlt[which(combined$GarageYrBlt == 2207)] <- 2007

library(e1071)
# MSSubClass, MoSold and YrSold are numeric codes for categories, so treat
# them as factors rather than continuous predictors.
for (col in c("MSSubClass", "MoSold", "YrSold")) {
  combined[[col]] <- as.factor(combined[[col]])
}

# Impute missing LotFrontage values with the mean of the observed ones.
missing.frontage <- is.na(combined$LotFrontage)
combined$LotFrontage[missing.frontage] <- mean(combined$LotFrontage, na.rm = TRUE)

# MasVnrArea: a missing masonry-veneer area means "no veneer", i.e. 0 sq ft.
# Fixed: the original indexed by is.na(MasVnrType), which left rows with a
# missing area (but recorded type) as NA and overwrote a recorded area when
# only the type was missing.
combined$MasVnrArea[is.na(combined$MasVnrArea)] <- 0
  
# Basement: houses without a basement (BsmtQual is NA) have NA square
# footage; that really means 0.
combined$BsmtFinSF1[is.na(combined$BsmtFinSF1) & is.na(combined$BsmtQual)] <- 0 
combined$BsmtFinSF2[is.na(combined$BsmtFinSF2) & is.na(combined$BsmtQual)] <- 0 
combined$BsmtUnfSF[is.na(combined$BsmtUnfSF) & is.na(combined$BsmtQual)] <- 0 
combined$TotalBsmtSF[is.na(combined$TotalBsmtSF) & is.na(combined$BsmtQual)] <- 0

# Bath counts: missing means no basement bathroom at all.
combined$BsmtFullBath[is.na(combined$BsmtFullBath)] <- 0 
combined$BsmtHalfBath[is.na(combined$BsmtHalfBath)] <- 0

# Garage: no garage means 0 cars / 0 area; for a missing garage year, fall
# back to the house's own construction year.
combined$GarageCars[is.na(combined$GarageCars)] <- 0
combined$GarageYrBlt[is.na(combined$GarageYrBlt)] <- combined$YearBuilt[is.na(combined$GarageYrBlt)]
combined$GarageArea[is.na(combined$GarageArea)] <- 0

# fix skewness for numeric variables
# Identify the numeric (non-factor) columns; class()[1] guards against
# columns with multiple classes (e.g. ordered factors).
var.classes <- vapply(names(combined), function(x){class(combined[[x]])[1]}, character(1)) 
numeric.col <- names(var.classes[var.classes != "factor"])
# Exclude the response, which was already log-transformed above.
# Fixed: the original dropped the *last* numeric column by position, which
# only works while SalePrice happens to be the final column.
numeric.col <- setdiff(numeric.col, "SalePrice")
# Log-transform the clearly right-skewed predictors (skewness > 0.75).
skew <- vapply(numeric.col, function(x){skewness(combined[[x]], na.rm = TRUE)}, numeric(1)) 
skew <- skew[skew > 0.75]
for (x in names(skew)) {
  combined[[x]] <- log(combined[[x]] + 1)
}

# Helper: convert a factor to character, replace NA entries with an explicit
# level, and convert back to a factor. This pattern was repeated verbatim for
# every categorical column below; it is now defined once.
replace_na_level <- function(x, value) {
  x <- as.character(x)
  x[is.na(x)] <- value
  as.factor(x)
}

# For these variables NA encodes "feature absent" (no alley, no basement, no
# fireplace, no garage, no pool, no fence, ...), so it becomes its own
# "None" level rather than a missing value.
none.cols <- c("Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
               "BsmtFinType2", "FireplaceQu", "GarageFinish", "GarageQual",
               "GarageCond", "PoolQC", "Fence", "MasVnrType", "GarageType",
               "MiscFeature")
for (col in none.cols) {
  combined[[col]] <- replace_na_level(combined[[col]], "None")
}

# drop Utilities that is not significant for prediction
# (all but one observation share the same level)
combined$Utilities <- NULL

# Impute the remaining categorical NAs with a fixed fallback level:
# Electrical, Functional, MSZoning and Exterior1st/2nd use their mode;
# KitchenQual uses the average grade "TA" (based on OverallQual); SaleType
# falls back to its explicit "Oth" (other) level.
mode.fill <- c(Electrical = "SBrkr", KitchenQual = "TA", Functional = "Typ",
               MSZoning = "RL", Exterior1st = "VinylSd",
               Exterior2nd = "HdBoard", SaleType = "Oth")
for (col in names(mode.fill)) {
  combined[[col]] <- replace_na_level(combined[[col]], mode.fill[[col]])
}

# Split the cleaned frame back into training and test sets (test rows were
# tagged with SalePrice = NA before the rbind above).
train <- combined[!is.na(combined$SalePrice),] 
test <- combined[is.na(combined$SalePrice),]

From the above analysis, we identified the best predictors for modeling this dataset, and we use them in each of the models below.

Different Models for the Dataset

Linear Regression
set.seed(421)

# 10-fold cross-validation for a plain linear model on the selected
# predictors: randomly permute the rows, cut them into 10 folds, and record
# the held-out MSE (on the log scale) for each fold.
fold.index <- cut(sample(1:nrow(train)), breaks=10, labels=FALSE) 
pred.error <- rep(0, 10)
for (i in 1:10) {
# Rows in fold i form the validation set; the rest train the model.
test.index <- which(fold.index==i)
sp.lm.mod <- lm(SalePrice~MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
               YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath
             +BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
               GarageCars+GarageArea+WoodDeckSF+ScreenPorch,train[-test.index,])
pred.lm <- predict(sp.lm.mod, train[test.index, ]) 
true.lm <- train[test.index, ]$SalePrice 
pred.error[i] <- mean((pred.lm-true.lm)^2)
}
# cv estimate
mse.lm <- mean(pred.error) 
mse.lm
## [1] 0.01959431
# Refit on the full training set and predict the test set for submission.
sp.lm.mod <- lm(SalePrice~MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
               YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath
             +BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
               GarageCars+GarageArea+WoodDeckSF+ScreenPorch,train)
pred.lm <- predict(sp.lm.mod, test)
# Any test rows that could not be scored get the mean prediction as a
# fallback so the submission file has no missing values.
pred.lm[which(is.na(pred.lm))] <- mean(pred.lm, na.rm = TRUE) 
# Undo the log(y + 1) transform to report prices on the original scale.
test.lm <- data.frame(Id = test.id, SalePrice = exp(pred.lm)-1) 
head(test.lm,5)
##        Id SalePrice
## 1461 1461  129581.6
## 1462 1462  156766.0
## 1463 1463  181371.1
## 1464 1464  205131.7
## 1465 1465  198316.5
write.csv(test.lm,file='Linear Regression Model Housing Price.csv',row.names = FALSE)
Subset Selection Methods
# Prediction method for leaps::regsubsets fits, which ship without one.
# Rebuilds the design matrix for `newdata` from the formula stored in the
# fitted object's call, then multiplies by the coefficients of the
# size-`id` model.
predict.regsubsets <- function(object, newdata, id) {
  model.formula <- as.formula(object$call[[2]])
  design <- model.matrix(model.formula, newdata)
  beta <- coef(object, id = id)
  design[, names(beta)] %*% beta
}
library(glmnet)  # already attached above; kept for chunk self-containment

# Exhaustive best-subset search over the chosen predictors (MSZoning expands
# into several dummy columns, hence nvmax = 27 candidate coefficients).
fit.best <- regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
               YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath
             +BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
               GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                       data = train, nvmax = 27)
best.summary <- summary(fit.best) 
which.min(best.summary$cp)
## [1] 24
which.min(best.summary$bic)
## [1] 21
which.max(best.summary$adjr2)
## [1] 24
# Plot each model-selection criterion against subset size and mark its optimum.
par(mfrow = c(2, 2))
plot(best.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l") 
points(24, best.summary$cp[24], pch = 4, col = "red", lwd = 7)
plot(best.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l") 
points(21, best.summary$bic[21], pch = 4, col = "red", lwd = 7)
plot(best.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, type = "l") 
# Fixed: the adjusted-R^2 optimum is at 24 (see which.max above), not 25.
points(24, best.summary$adjr2[24], pch = 4, col = "red", lwd = 7)
coef(fit.best, which.max(best.summary$adjr2))
##  (Intercept)   MSZoningFV   MSZoningRH   MSZoningRL   MSZoningRM      LotArea 
##  0.518572979  0.389170744  0.359652251  0.324018606  0.286519770  0.085466357 
##  OverallQual  OverallCond    YearBuilt YearRemodAdd   BsmtFinSF1    BsmtUnfSF 
##  0.086852678  0.043388514  0.002278495  0.001186687  0.010678988  0.004855216 
##    X1stFlrSF    X2ndFlrSF LowQualFinSF BsmtFullBath     FullBath     HalfBath 
##  0.335034068  0.019963961  0.009472446  0.039000037  0.041368414  0.031215373 
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd   Fireplaces   GarageCars   WoodDeckSF 
## -0.015965582 -0.211327411  0.128039872  0.030903591  0.070370134  0.004130451 
##  ScreenPorch 
##  0.009728612

According to the above summary, best subset selection chooses 24 variables by Cp, 21 variables by BIC and 24 variables by adjusted R^2.

# 10-fold cross-validation over all candidate subset sizes (1..27) to pick
# the size with the lowest held-out MSE.
k = 10
set.seed(123)
folds = sample(1:k, nrow(train), replace = TRUE) 
cv.errors = matrix(NA, k, 27, dimnames = list(NULL, paste(1:27))) 
for(j in 1:k){
  # Refit best-subset search on everything except fold j.
  fit.best = regsubsets(SalePrice ~
                          MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch, data = train[folds != j,],
                        nvmax=27)
  # Score every subset size on the held-out fold.
  for (i in 1:27){
    pred = predict.regsubsets(fit.best, train[folds == j, ], id = i) 
    cv.errors[j, i] = mean((train$SalePrice[folds == j] - pred)^2)
    }
}
# Average the fold errors per subset size.
mean.cv.errors = apply(cv.errors, 2, mean) 
mean.cv.errors
##          1          2          3          4          5          6          7 
## 0.05334174 0.04197162 0.03821557 0.03119303 0.02739346 0.02469638 0.02423254 
##          8          9         10         11         12         13         14 
## 0.02185710 0.02229178 0.02142727 0.02157552 0.02109429 0.02156106 0.02132758 
##         15         16         17         18         19         20         21 
## 0.02122264 0.02066810 0.02037142 0.02071274 0.02073043 0.02028587 0.02003563 
##         22         23         24         25         26         27 
## 0.01982793 0.01972963 0.01957019 0.01956300 0.01955967 0.01957147
mse.best <- min(mean.cv.errors) 
names(which.min(mean.cv.errors))                   
## [1] "26"
# Refit on the full training set with the CV-selected size (26) and build
# the submission predictions by hand from the model matrix.
fit.best = regsubsets(SalePrice ~
                        MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                      train, nvmax = 26)
test.mat = model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+
                           BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+
                           BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ 
                           GarageCars+GarageArea+WoodDeckSF+ScreenPorch, test)
coefi = coef(fit.best, id = 26)
pred.best = test.mat[,names(coefi)]%*%coefi
# Undo the log(y + 1) transform to report prices on the original scale.
pred.best = data.frame(exp(pred.best)-1)
test.best <- data.frame(Id = test.id, SalePrice = pred.best[, 1]) 
head(test.best, 5)
##     Id SalePrice
## 1 1461  129653.2
## 2 1462  156881.7
## 3 1463  181505.8
## 4 1464  205273.5
## 5 1465  198391.2
write.csv(test.best,file='Best Subset selection Housing Price.csv',row.names = FALSE)
Forward Selection Method
# Forward stepwise selection over the same predictor set.
fit.fwd <- regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                        X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
                        GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                      data = train, nvmax = 27, method='forward')
fwd.summary <- summary(fit.fwd) 
which.min(fwd.summary$cp)
## [1] 24
which.min(fwd.summary$bic)
## [1] 21
which.max(fwd.summary$adjr2)
## [1] 24
# Plot each model-selection criterion against subset size and mark its optimum.
par(mfrow = c(2, 2))
plot(fwd.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(24, fwd.summary$cp[24], pch = 4, col = "red", lwd = 7)
plot(fwd.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l") 
points(21, fwd.summary$bic[21], pch = 4, col = "red", lwd = 7)
plot(fwd.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, type = "l") 
# Fixed: the adjusted-R^2 optimum is at 24 (see which.max above), not 25.
points(24, fwd.summary$adjr2[24], pch = 4, col = "red", lwd = 7)
coef(fit.fwd, which.max(fwd.summary$adjr2))
##  (Intercept)   MSZoningFV   MSZoningRH   MSZoningRL   MSZoningRM      LotArea 
##  0.518572979  0.389170744  0.359652251  0.324018606  0.286519770  0.085466357 
##  OverallQual  OverallCond    YearBuilt YearRemodAdd   BsmtFinSF1    BsmtUnfSF 
##  0.086852678  0.043388514  0.002278495  0.001186687  0.010678988  0.004855216 
##    X1stFlrSF    X2ndFlrSF LowQualFinSF BsmtFullBath     FullBath     HalfBath 
##  0.335034068  0.019963961  0.009472446  0.039000037  0.041368414  0.031215373 
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd   Fireplaces   GarageCars   WoodDeckSF 
## -0.015965582 -0.211327411  0.128039872  0.030903591  0.070370134  0.004130451 
##  ScreenPorch 
##  0.009728612

# 10-fold cross-validation of forward selection across subset sizes 1..27.
k = 10
set.seed(123)
folds = sample(1:k, nrow(train), replace = TRUE) 
cv.errors = matrix(NA, k, 27, dimnames = list(NULL, paste(1:27)))
for(j in 1:k){
  # Refit forward selection on everything except fold j.
  fit.fwd = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                         X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                       data = train[folds != j,], nvmax = 27, method = "forward")
  # Score every subset size on the held-out fold.
  for (i in 1:27){
    pred = predict.regsubsets(fit.fwd, train[folds == j, ], id = i)
    cv.errors[j, i] = mean((train$SalePrice[folds == j] - pred)^2) 
    }
}
# Average the fold errors per subset size.
mean.cv.errors = apply(cv.errors, 2, mean) 
mean.cv.errors
##          1          2          3          4          5          6          7 
## 0.05334174 0.04197162 0.03821557 0.03353788 0.03138078 0.02766656 0.02608801 
##          8          9         10         11         12         13         14 
## 0.02350706 0.02278304 0.02117378 0.02102577 0.02106928 0.02077449 0.02065817 
##         15         16         17         18         19         20         21 
## 0.02052812 0.02039953 0.02028935 0.02028194 0.02029586 0.02026476 0.02038669 
##         22         23         24         25         26         27 
## 0.02013680 0.01974959 0.01957019 0.01956300 0.01955967 0.01957147
mse.fwd <- min(mean.cv.errors) 
names(which.min(mean.cv.errors))     
## [1] "26"
# Refit on the full training set with the CV-selected size (26) and build
# the submission predictions from the model matrix.
fit.fwd = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                       X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ 
                       GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train, nvmax = 26, method = "forward")
test.mat = model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                           X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
                           GarageCars+GarageArea+WoodDeckSF+ScreenPorch, test)
coefi = coef(fit.fwd, id = 26)
pred.fwd = test.mat[,names(coefi)]%*%coefi
# Undo the log(y + 1) transform to report prices on the original scale.
pred.fwd = data.frame(exp(pred.fwd)-1)
test.fwd <- data.frame(Id = test.id, SalePrice = pred.fwd[, 1]) 
head(test.fwd, 5)
##     Id SalePrice
## 1 1461  129653.2
## 2 1462  156881.7
## 3 1463  181505.8
## 4 1464  205273.5
## 5 1465  198391.2
write.csv(test.fwd,file='Forward Selection Housing Price.csv',row.names = FALSE)
Backward Selection
# Backward stepwise selection over the same predictor set.
fit.bwd <- regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                        X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ 
                        GarageCars+GarageArea+WoodDeckSF+ScreenPorch, data = train, nvmax = 27, method = 'backward')
bwd.summary <- summary(fit.bwd) 
which.min(bwd.summary$cp)
## [1] 24
which.min(bwd.summary$bic)
## [1] 21
which.max(bwd.summary$adjr2)
## [1] 24
# Plot each model-selection criterion against subset size and mark its optimum.
par(mfrow = c(2, 2))
plot(bwd.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(24, bwd.summary$cp[24], pch = 4, col = "red", lwd = 7)
plot(bwd.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l") 
points(21, bwd.summary$bic[21], pch = 4, col = "red", lwd = 7)
plot(bwd.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, type = "l") 
# Fixed: the adjusted-R^2 optimum is at 24 (see which.max above), not 25.
points(24, bwd.summary$adjr2[24], pch = 4, col = "red", lwd = 7)
coef(fit.bwd, which.max(bwd.summary$adjr2))
##  (Intercept)   MSZoningFV   MSZoningRH   MSZoningRL   MSZoningRM      LotArea 
##  0.518572979  0.389170744  0.359652251  0.324018606  0.286519770  0.085466357 
##  OverallQual  OverallCond    YearBuilt YearRemodAdd   BsmtFinSF1    BsmtUnfSF 
##  0.086852678  0.043388514  0.002278495  0.001186687  0.010678988  0.004855216 
##    X1stFlrSF    X2ndFlrSF LowQualFinSF BsmtFullBath     FullBath     HalfBath 
##  0.335034068  0.019963961  0.009472446  0.039000037  0.041368414  0.031215373 
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd   Fireplaces   GarageCars   WoodDeckSF 
## -0.015965582 -0.211327411  0.128039872  0.030903591  0.070370134  0.004130451 
##  ScreenPorch 
##  0.009728612

# 10-fold cross-validation of backward selection across subset sizes 1..27.
k = 10
set.seed(123)
folds = sample(1:k, nrow(train), replace = TRUE) 
cv.errors = matrix(NA, k, 27, dimnames = list(NULL, paste(1:27))) 
for(j in 1:k){
  # Refit backward selection on everything except fold j.
  # (Note: named best.fit here, unlike fit.fwd in the forward-selection loop.)
  best.fit = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                          X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ 
                          GarageCars+GarageArea+WoodDeckSF+ScreenPorch, data = train[folds != j,], nvmax = 27, method = "backward")
  # Score every subset size on the held-out fold.
  for (i in 1:27){
    pred = predict.regsubsets(best.fit, train[folds == j, ], id = i) 
    cv.errors[j, i] = mean((train$SalePrice[folds == j] - pred)^2)
    }
}
# Average the fold errors per subset size.
mean.cv.errors = apply(cv.errors, 2, mean) 
mean.cv.errors
##          1          2          3          4          5          6          7 
## 0.05334174 0.04197162 0.03700267 0.03119303 0.02739346 0.02469638 0.02423254 
##          8          9         10         11         12         13         14 
## 0.02228972 0.02222079 0.02190389 0.02175932 0.02209848 0.02208304 0.02113129 
##         15         16         17         18         19         20         21 
## 0.02118976 0.02112810 0.02091671 0.02044549 0.02034798 0.02020231 0.02003563 
##         22         23         24         25         26         27 
## 0.01982793 0.01972963 0.01957019 0.01956300 0.01955967 0.01957147
mse.bwd <- min(mean.cv.errors) 
names(which.min(mean.cv.errors))
## [1] "26"
# Refit on the full training set with the CV-selected size (26) and build
# the submission predictions from the model matrix.
fit.bwd = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                       X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
                       GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train, nvmax = 26, method = 'backward')
test.mat = model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                           X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+
                           TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch, test)
coefi = coef(fit.bwd, id = 26)
pred.bwd = test.mat[,names(coefi)]%*%coefi
# Undo the log(y + 1) transform to report prices on the original scale.
pred.bwd = data.frame(exp(pred.bwd)-1)
test.bwd <- data.frame(Id = test.id, SalePrice = pred.bwd[, 1]) 
head(test.bwd, 5)
##     Id SalePrice
## 1 1461  129653.2
## 2 1462  156881.7
## 3 1463  181505.8
## 4 1464  205273.5
## 5 1465  198391.2
write.csv(test.bwd,file='Backward Selection Housing Price.csv',row.names = FALSE)
Shrinkage Methods
Ridge
# Hold-out evaluation of ridge regression (alpha = 0): 1000 random rows for
# fitting, the remaining rows as a validation set.
set.seed(123)
train.index = sample(nrow(train), 1000)
test.index = -train.index
ridge.train = train[train.index, ]
ridge.test = train[test.index, ]
# Design matrices; [, -1] drops the intercept column (glmnet adds its own).
# NOTE(review): if a factor level of MSZoning is absent from one subset, the
# two matrices could disagree in columns -- TODO confirm.
x.train = model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
                       YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+
                         BsmtFullBath+BsmtHalfBath+FullBath+
                         HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+
                         WoodDeckSF+ScreenPorch, ridge.train)[,-1]
x.test = model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+
                        BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                        X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+
                        KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                      ridge.test)[,-1]
# Lambda grid from 10^10 (essentially the null model) down to 0.01.
grid = 10^seq(10, -2, length = 100)
fit.ridge = cv.glmnet(x.train, ridge.train$SalePrice, alpha = 0, lambda = grid, thresh = 1e-12)
lambda = fit.ridge$lambda.min
pred.ridge = predict(fit.ridge, newx = x.test, s = lambda)
# cv estimate
mse.ridge <- mean((ridge.test$SalePrice-pred.ridge)^2)
mse.ridge
## [1] 0.01770622
# Refit ridge on the full training set and predict the Kaggle test set.
set.seed(123)
x.train <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
                        YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+ 
                          HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train)[,-1]
# One-sided formula: the test file has no SalePrice column. NOTE(review):
# rows with NAs in any predictor are silently dropped by model.matrix(),
# which would break the pairing with test.id -- TODO confirm.
x.test <- model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                          X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces
                        +GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                        test)[,-1]
grid <- 10^seq(10, -2, length = 100)
fit.ridge <- cv.glmnet(x.train, train$SalePrice, alpha = 0, lambda = grid, thresh = 1e-12) 
lambda <- fit.ridge$lambda.min
# Back-transform from the (presumed) log(1 + price) scale -- TODO confirm.
pred.ridge <- data.frame(exp(predict(fit.ridge, newx = x.test, s = lambda)) - 1)
test.ridge <- data.frame(Id = test.id, SalePrice = pred.ridge[, 1]) 
head(test.ridge, 5)
##     Id SalePrice
## 1 1461  129555.9
## 2 1462  156119.4
## 3 1463  181063.7
## 4 1464  205003.7
## 5 1465  197330.6
write.csv(test.ridge,file='Shrinkage-Ridge Housing Price.csv',row.names = FALSE)
Lasso
# Hold-out evaluation of the lasso (alpha = 1): 1000 random rows for fitting,
# the remaining rows as a validation set.
set.seed(123)
train.index = sample(nrow(train), 1000)
test.index = -train.index
lasso.train = train[train.index, ]
lasso.test = train[test.index, ]
# One formula object shared by both design matrices; [, -1] drops the
# intercept column since glmnet adds its own.
lasso.form = SalePrice ~ MSZoning + LotArea + OverallQual + OverallCond + YearBuilt +
  YearRemodAdd + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + X1stFlrSF + X2ndFlrSF +
  LowQualFinSF + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr +
  KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + GarageArea +
  WoodDeckSF + ScreenPorch
x.train = model.matrix(lasso.form, lasso.train)[, -1]
x.test = model.matrix(lasso.form, lasso.test)[, -1]
# Lambda grid from 10^10 (null model) down to 0.01.
grid = 10^seq(10, -2, length = 100)
fit.lasso = cv.glmnet(x.train, lasso.train$SalePrice, alpha = 1, lambda = grid, thresh = 1e-12)
lambda = fit.lasso$lambda.min
pred.lasso = predict(fit.lasso, newx = x.test, s = lambda)
# cv estimate
mse.lasso <- mean((lasso.test$SalePrice - pred.lasso)^2)
mse.lasso
## [1] 0.01959865
# Refit the lasso on the full training set and predict the Kaggle test set.
set.seed(123)
x.train <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
                        YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+ 
                          HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train)[,-1]
# One-sided formula: the test file has no SalePrice column. NOTE(review):
# model.matrix() drops rows with NAs in these predictors, which would break
# the pairing with test.id -- TODO confirm.
x.test <- model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                          X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces
                        +GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                        test)[,-1]
grid <- 10^seq(10, -2, length = 100)
fit.lasso <- cv.glmnet(x.train, train$SalePrice, alpha = 1, lambda = grid, thresh = 1e-12) 
lambda <- fit.lasso$lambda.min
# Back-transform from the (presumed) log(1 + price) scale -- TODO confirm.
pred.lasso <- data.frame(exp(predict(fit.lasso, newx = x.test, s = lambda)) - 1)
test.lasso <- data.frame(Id = test.id, SalePrice = pred.lasso[, 1]) 
head(test.lasso, 5)
##     Id SalePrice
## 1 1461  122247.7
## 2 1462  156569.4
## 3 1463  175096.9
## 4 1464  197648.4
## 5 1465  199438.1
write.csv(test.lasso,file='Shrinkage-lasso Housing Price.csv',row.names = FALSE)
GAM Method

First, we use smooth.spline() with cross-validation to find a suitable effective degrees of freedom (df) for each variable in the GAM model.

# NOTE(review): every call below passes smooth.spline(x = train$SalePrice,
# y = predictor), i.e. it smooths the PREDICTOR as a function of the RESPONSE.
# The intended usage is almost certainly smooth.spline(predictor, SalePrice,
# cv = TRUE) -- the argument order looks reversed. TODO confirm and re-run,
# because the CV-selected df values printed here drive the s(, df = ) choices
# in the gam() fit further down. The "non-unique 'x' values" warnings are a
# symptom: the response has many ties.
smooth.spline(train$SalePrice,train$LotArea,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$LotArea, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$LotArea, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.034062  lambda= 0.01039059 (11 iterations)
## Equivalent Degrees of Freedom (Df): 6.634186
## Penalized Criterion (RSS): 144.8477
## PRESS(l.o.o. CV): 0.2247289
smooth.spline(train$SalePrice,train$OverallQual,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$OverallQual, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$OverallQual, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.024431  lambda= 0.008842534 (9 iterations)
## Equivalent Degrees of Freedom (Df): 6.879738
## Penalized Criterion (RSS): 405.4562
## PRESS(l.o.o. CV): 0.616644
smooth.spline(train$SalePrice,train$OverallCond,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$OverallCond, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$OverallCond, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.9826638  lambda= 0.0044139 (11 iterations)
## Equivalent Degrees of Freedom (Df): 8.044889
## Penalized Criterion (RSS): 728.3422
## PRESS(l.o.o. CV): 1.158312
smooth.spline(train$SalePrice,train$YearBuilt,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$YearBuilt, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$YearBuilt, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.8557543  lambda= 0.0005344867 (11 iterations)
## Equivalent Degrees of Freedom (Df): 12.9412
## Penalized Criterion (RSS): 344826.4
## PRESS(l.o.o. CV): 556.9464
smooth.spline(train$SalePrice,train$YearRemodAdd,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$YearRemodAdd, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$YearRemodAdd, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.004997  lambda= 0.006399878 (12 iterations)
## Equivalent Degrees of Freedom (Df): 7.399341
## Penalized Criterion (RSS): 157106.9
## PRESS(l.o.o. CV): 278.8817
smooth.spline(train$SalePrice,train$BsmtFinSF1,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BsmtFinSF1, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BsmtFinSF1, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.7910247  lambda= 0.0001820888 (11 iterations)
## Equivalent Degrees of Freedom (Df): 16.51093
## Penalized Criterion (RSS): 5662.935
## PRESS(l.o.o. CV): 8.324703
smooth.spline(train$SalePrice,train$BsmtFinSF2,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BsmtFinSF2, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BsmtFinSF2, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.043565  lambda= 0.01215677 (10 iterations)
## Equivalent Degrees of Freedom (Df): 6.403602
## Penalized Criterion (RSS): 1939.963
## PRESS(l.o.o. CV): 3.382815
smooth.spline(train$SalePrice,train$BsmtUnfSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BsmtUnfSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BsmtUnfSF, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.152389  lambda= 0.07439161 (12 iterations)
## Equivalent Degrees of Freedom (Df): 4.261831
## Penalized Criterion (RSS): 1881.646
## PRESS(l.o.o. CV): 3.290543
smooth.spline(train$SalePrice,train$X1stFlrSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$X1stFlrSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$X1stFlrSF, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.208518  lambda= 0.1892522 (18 iterations)
## Equivalent Degrees of Freedom (Df): 3.47602
## Penalized Criterion (RSS): 41.84421
## PRESS(l.o.o. CV): 0.06338194
smooth.spline(train$SalePrice,train$X2ndFlrSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$X2ndFlrSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$X2ndFlrSF, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.9591777  lambda= 0.002986355 (11 iterations)
## Equivalent Degrees of Freedom (Df): 8.784485
## Penalized Criterion (RSS): 7003.273
## PRESS(l.o.o. CV): 10.43444
smooth.spline(train$SalePrice,train$LowQualFinSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$LowQualFinSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$LowQualFinSF, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.163104  lambda= 0.08890663 (12 iterations)
## Equivalent Degrees of Freedom (Df): 4.096689
## Penalized Criterion (RSS): 348.3476
## PRESS(l.o.o. CV): 0.5581096
smooth.spline(train$SalePrice,train$BedroomAbvGr,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BedroomAbvGr, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BedroomAbvGr, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.163691  lambda= 0.08967998 (16 iterations)
## Equivalent Degrees of Freedom (Df): 4.088852
## Penalized Criterion (RSS): 435.4369
## PRESS(l.o.o. CV): 0.6303175
smooth.spline(train$SalePrice,train$KitchenAbvGr,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$KitchenAbvGr, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$KitchenAbvGr, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.8159963  lambda= 0.0002758647 (14 iterations)
## Equivalent Degrees of Freedom (Df): 15.02772
## Penalized Criterion (RSS): 4.376797
## PRESS(l.o.o. CV): 0.007769115
smooth.spline(train$SalePrice,train$TotRmsAbvGrd,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$TotRmsAbvGrd, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$TotRmsAbvGrd, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.324724  lambda= 1.307238 (18 iterations)
## Equivalent Degrees of Freedom (Df): 2.433439
## Penalized Criterion (RSS): 20.52923
## PRESS(l.o.o. CV): 0.03258072
smooth.spline(train$SalePrice,train$GarageArea,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$GarageArea, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$GarageArea, cv = TRUE)
## 
## Smoothing Parameter  spar= 1.054994  lambda= 0.01470245 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.134974
## Penalized Criterion (RSS): 16805833
## PRESS(l.o.o. CV): 26261.65
smooth.spline(train$SalePrice,train$WoodDeckSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$WoodDeckSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$WoodDeckSF, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.8717063  lambda= 0.0006969245 (15 iterations)
## Equivalent Degrees of Freedom (Df): 12.1895
## Penalized Criterion (RSS): 3873.201
## PRESS(l.o.o. CV): 5.933387
smooth.spline(train$SalePrice,train$ScreenPorch,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$ScreenPorch, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$ScreenPorch, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.7890874  lambda= 0.0001765097 (10 iterations)
## Equivalent Degrees of Freedom (Df): 16.62785
## Penalized Criterion (RSS): 1146.012
## PRESS(l.o.o. CV): 1.930691
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16.1
# GAM with smoothing-spline terms; the df of each s() term is the (rounded)
# CV-selected df printed by the smooth.spline runs above. NOTE(review):
# those df values were obtained with the smooth.spline arguments apparently
# reversed (see the note there), and s(BsmtFinSF2, df = 2) does not match
# the printed CV value (~6.4) -- TODO confirm.
sp.gam <- gam(SalePrice~ MSZoning+s(LotArea,df=7)+s(OverallQual,df=7)+s(OverallCond,df=8)+s(YearBuilt,df=13)+s(YearRemodAdd,df=7)+s(BsmtFinSF1,df=16)+s(BsmtFinSF2,df=2)+s(BsmtUnfSF,df=4)+s(X1stFlrSF,df=3)+s(X2ndFlrSF,df=9)+s(LowQualFinSF,df=4)+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+s(BedroomAbvGr,df=4)+s(KitchenAbvGr,df=15)+s(TotRmsAbvGrd,df=2)+Fireplaces+GarageCars+s(GarageArea,df=6)+s(WoodDeckSF,df=12)+s(ScreenPorch,df=17),data=train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
plot(sp.gam,se=TRUE,col='blue')

# Training MSE (optimistic: evaluated on the same data the model was fit to).
train.gam <- predict(sp.gam,train)
mse.gam <- mean((train.gam-train$SalePrice)^2)
mse.gam
## [1] 0.01229514
# Kaggle submission; exp(.) - 1 back-transforms from the (presumed)
# log(1 + price) modelling scale -- TODO confirm.
yhat.gam <- predict(sp.gam,newdata = test)
test.gam <- data.frame(Id=test.id,SalePrice=exp(yhat.gam)-1)
head(test.gam,5)
##        Id SalePrice
## 1461 1461  122587.6
## 1462 1462  159071.8
## 1463 1463  186595.6
## 1464 1464  204815.7
## 1465 1465  189920.9
write.csv(test.gam,file = 'GAM Model Housing Price.csv',row.names = FALSE)
Regression Tree Method

First, we use all of the available variables to fit an unpruned regression tree.

library(tree)
# Unpruned regression tree on all available predictors (SalePrice ~ .).
# Only three variables end up being used for splits (see summary below).
sp.tree <- tree(SalePrice~.,data=train)
summary(sp.tree)
## 
## Regression tree:
## tree(formula = SalePrice ~ ., data = train)
## Variables actually used in tree construction:
## [1] "OverallQual"  "Neighborhood" "GrLivArea"   
## Number of terminal nodes:  10 
## Residual mean deviance:  0.04059 = 58.86 / 1450 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.133000 -0.108100  0.006889  0.000000  0.116900  0.795200

Tree model MSE for training: 0.04059 Terminal Nodes: 10

plot(sp.tree)
text(sp.tree,pretty=0)

# Training MSE of the unpruned tree (optimistic; in-sample).
train.tree <- predict(sp.tree,train)
mse.tree <- mean((train.tree-train$SalePrice)^2)
mse.tree
## [1] 0.04031398
# Kaggle submission; exp(.) - 1 back-transforms from the (presumed)
# log(1 + price) modelling scale -- TODO confirm.
sp.tree.pred <- predict(sp.tree,newdata=test)
sp.tree.table <- data.frame(Id=test.id,SalePrice=exp(sp.tree.pred)-1)
head(sp.tree.table,5)
##        Id SalePrice
## 1461 1461  129032.3
## 1462 1462  150665.9
## 1463 1463  183795.7
## 1464 1464  183795.7
## 1465 1465  267584.7
write.csv(sp.tree.table,file='Tree Model Housing Price.csv',row.names = FALSE)
# 10-fold cost-complexity cross-validation to choose the tree size; the size
# minimizing deviance turns out to equal the unpruned size (10).
set.seed(422)
cv.bal <- cv.tree(sp.tree,K=10)
best.size <- cv.bal$size[which.min(cv.bal$dev)]
best.size
## [1] 10

Since the cross-validation we ran indicates that the best number of terminal nodes for the tree model is 10 — the same as the unpruned tree — we simply keep the unpruned tree as our tree regression method.

Bagging
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Bagging = random forest with mtry set to (essentially) all predictors.
# NOTE(review): mtry = 78 should equal the number of predictors available for
# splitting -- confirm this matches ncol(train) minus the response (and Id).
# No set.seed() precedes this fit, so the importance values and MSEs printed
# below are not exactly reproducible on re-knit.
sp.bag <- randomForest(SalePrice ~.,data=train,mtry=78,importance=TRUE,ntree=1000)
importance(sp.bag)
##                    %IncMSE IncNodePurity
## MSSubClass     31.67601564  3.084394e+00
## MSZoning        9.00012614  8.706961e-01
## LotFrontage     6.33728592  7.227263e-01
## LotArea        21.49942366  1.956873e+00
## Street          0.16561912  4.742735e-03
## Alley           3.98995513  1.551027e-01
## LotShape        4.30997733  1.037558e-01
## LandContour    -0.62431221  2.135910e-01
## LotConfig       1.07681703  1.441433e-01
## LandSlope       1.12480449  7.026129e-02
## Neighborhood   67.92679121  2.393360e+01
## Condition1      2.14773234  1.849033e-01
## Condition2      0.11759084  1.087263e-02
## BldgType        8.14326005  8.681719e-02
## HouseStyle      5.23765596  1.509648e-01
## OverallQual   109.80356453  1.232858e+02
## OverallCond    26.32059779  1.630250e+00
## YearBuilt      16.27998482  9.648326e-01
## YearRemodAdd   15.94919289  1.122072e+00
## RoofStyle       1.29265881  1.469567e-01
## RoofMatl       -0.33707188  9.990567e-03
## Exterior1st    10.74289088  1.122394e+00
## Exterior2nd    12.92418890  1.477688e+00
## MasVnrType      5.25251570  1.027600e-01
## MasVnrArea      8.84340937  4.365222e-01
## ExterQual       9.74880431  2.472902e-01
## ExterCond       4.57986292  4.294897e-01
## Foundation      6.94048303  1.608849e-01
## BsmtQual        8.92136532  5.924880e-01
## BsmtCond        9.71415625  3.325351e-01
## BsmtExposure    9.12934516  3.803916e-01
## BsmtFinType1   21.27698860  1.218419e+00
## BsmtFinSF1     36.10710742  3.568465e+00
## BsmtFinType2   -0.86540529  1.531030e-01
## BsmtFinSF2      1.16343327  5.072025e-02
## BsmtUnfSF      12.86031058  8.364759e-01
## TotalBsmtSF    47.81515257  6.633855e+00
## Heating        -2.10018188  4.671189e-02
## HeatingQC       5.08458067  3.614053e-01
## CentralAir      9.49644290  1.675958e+00
## Electrical     -2.97826096  1.169865e-01
## X1stFlrSF      27.85335342  4.176168e+00
## X2ndFlrSF      18.45915351  9.548607e-01
## LowQualFinSF   -1.73064696  2.739029e-02
## GrLivArea      97.76838153  2.399348e+01
## BsmtFullBath   12.39139506  2.598507e-01
## BsmtHalfBath    0.47808751  1.716792e-02
## FullBath        7.67993928  2.487244e-01
## HalfBath        9.21884664  1.198790e-01
## BedroomAbvGr    9.03889360  2.262668e-01
## KitchenAbvGr    5.49101435  6.499158e-02
## KitchenQual    15.83004927  5.590770e-01
## TotRmsAbvGrd    7.02945838  4.819138e-01
## Functional      1.89288660  2.827865e-01
## Fireplaces      7.60673353  2.169386e-01
## FireplaceQu    11.74247569  7.703773e-01
## GarageType     18.88794021  1.520622e+00
## GarageYrBlt    11.98895474  7.431595e-01
## GarageFinish   14.42021789  5.658529e-01
## GarageCars     20.98181564  6.264109e+00
## GarageArea     23.78222178  4.349662e+00
## GarageQual      9.09588979  3.384324e-01
## GarageCond      0.04553057  2.783718e-01
## PavedDrive      2.06143311  1.534337e-01
## WoodDeckSF      6.30438075  3.916525e-01
## OpenPorchSF    10.73946847  6.742840e-01
## EnclosedPorch   3.85584688  3.077440e-01
## X3SsnPorch      0.24708882  2.171863e-02
## ScreenPorch     4.32185749  8.813215e-02
## PoolArea        1.69518703  1.027626e-02
## PoolQC         -1.12050999  8.119013e-03
## Fence          -1.54378156  2.195953e-01
## MiscFeature    -1.11821662  1.368547e-02
## MiscVal        -0.65944586  2.577852e-02
## MoSold          1.22088600  3.363362e+00
## YrSold          0.20176443  5.661495e-01
## SaleType        0.59498049  1.387814e-01
## SaleCondition  -2.26924391  5.023724e-01
varImpPlot(sp.bag)

From the summary above, we can conclude that OverallQual is the most important predictor, since its %IncMSE value is the largest of all the predictors.

# Training-set MSE. NOTE(review): predicting on the training data reuses
# in-bag trees, so this is optimistic; the out-of-bag estimate (sp.bag$mse)
# would be a fairer error measure.
train.bag <- predict(sp.bag,train)
mse.bag <- mean((train.bag-train$SalePrice)^2)
mse.bag
## [1] 0.003282359
# Kaggle submission; exp(.) - 1 back-transforms from the (presumed)
# log(1 + price) modelling scale -- TODO confirm.
sp.bag.pred <- predict(sp.bag,newdata=test)
sp.bag.table <- data.frame(Id=test.id,SalePrice=exp(sp.bag.pred)-1)
head(sp.bag.table,5)
##        Id SalePrice
## 1461 1461  125441.2
## 1462 1462  155426.5
## 1463 1463  184248.1
## 1464 1464  182322.5
## 1465 1465  192991.7
write.csv(sp.bag.table,file = 'Bagging Method Housing Price.csv',row.names = FALSE)
RandomForest Method
set.seed(422)
# Random forest proper: only round(sqrt(78)) = 9 randomly chosen predictors
# are considered at each split (vs. all 78 in the bagging fit above).
sp.rf <- randomForest(SalePrice~.,data=train,mtry=round(sqrt(78)),importance=TRUE,ntree=1000)
importance(sp.rf)
##                   %IncMSE IncNodePurity
## MSSubClass    25.84482350    4.97654145
## MSZoning      12.90308438    1.39427090
## LotFrontage   13.36020269    2.02244805
## LotArea       24.07461188    3.71553730
## Street        -1.14816391    0.02675323
## Alley          3.39807288    0.17447754
## LotShape       6.74512237    0.34872657
## LandContour    4.66015037    0.36879936
## LotConfig      0.64077209    0.27122050
## LandSlope      2.96096932    0.20929149
## Neighborhood  33.19627548   21.88434565
## Condition1     5.00183105    0.33181593
## Condition2    -0.21995220    0.04172898
## BldgType      10.76362487    0.38977054
## HouseStyle    14.50711296    0.89750520
## OverallQual   27.20118282   23.46778903
## OverallCond   17.49448212    1.55271957
## YearBuilt     16.67909845    8.50636728
## YearRemodAdd  16.19349570    4.25784547
## RoofStyle      5.16274436    0.45014136
## RoofMatl      -0.04126856    0.12942735
## Exterior1st   13.22455265    1.87670095
## Exterior2nd   12.63420918    1.85948965
## MasVnrType     8.64297207    0.44084464
## MasVnrArea    13.09124065    1.43312465
## ExterQual     16.74938610   11.61587272
## ExterCond      0.29241908    0.41529118
## Foundation     8.94845843    2.39261832
## BsmtQual      13.94192419    6.77671953
## BsmtCond       8.94329746    0.45382721
## BsmtExposure  10.61186162    0.79766511
## BsmtFinType1  15.18664669    1.75034938
## BsmtFinSF1    25.24737834    3.74003664
## BsmtFinType2   4.26076186    0.30063702
## BsmtFinSF2     2.53430221    0.16736087
## BsmtUnfSF     12.04060622    1.13223991
## TotalBsmtSF   27.58258919    9.51059839
## Heating       -0.90739233    0.17934422
## HeatingQC      9.29688689    0.88920508
## CentralAir    10.38259155    1.43999545
## Electrical    -0.36878717    0.23319167
## X1stFlrSF     28.16320346    7.64389466
## X2ndFlrSF     27.71642487    3.50542009
## LowQualFinSF  -2.24360250    0.06166294
## GrLivArea     36.80667642   21.60881653
## BsmtFullBath  11.07122820    0.44029561
## BsmtHalfBath   2.96084588    0.06785438
## FullBath      14.82919804    5.74872241
## HalfBath      11.93685927    0.55509351
## BedroomAbvGr  13.03808889    1.15412249
## KitchenAbvGr   6.29987235    0.15217718
## KitchenQual   16.63329840    9.86004616
## TotRmsAbvGrd  19.38180601    3.19921199
## Functional     3.47529593    0.32153561
## Fireplaces    16.94473451    3.02607001
## FireplaceQu   20.12083415    5.08909125
## GarageType    14.78646382    4.62269838
## GarageYrBlt   14.41089256    6.08684378
## GarageFinish  13.46340029    5.75787211
## GarageCars    19.01313159    8.68122707
## GarageArea    21.98127382    8.95913597
## GarageQual     5.76043590    0.94502773
## GarageCond     4.56305817    1.03945544
## PavedDrive     5.56635847    0.43625239
## WoodDeckSF     8.27982595    0.76990639
## OpenPorchSF   13.43964706    1.46184375
## EnclosedPorch  0.68257099    0.29622156
## X3SsnPorch    -0.46586811    0.03532769
## ScreenPorch    4.86860341    0.20310578
## PoolArea      -1.80100631    0.03932974
## PoolQC        -1.57508949    0.03928185
## Fence          1.69497692    0.37383928
## MiscFeature    0.42735380    0.05698580
## MiscVal        0.54841276    0.07306551
## MoSold         0.47324469    2.33289778
## YrSold        -0.13197481    0.73029816
## SaleType       2.09530687    0.24001524
## SaleCondition  2.65442366    0.61948718
varImpPlot(sp.rf)

It seems that GrLivArea is the most important predictor of all.

# Training-set MSE of the random forest. NOTE(review): as with bagging this
# is optimistic; the OOB estimate (sp.rf$mse) would be a fairer measure.
train.random <- predict(sp.rf,train)
mse.random <- mean((train.random-train$SalePrice)^2)
mse.random
## [1] 0.003662686
# Kaggle submission; exp(.) - 1 back-transforms from the (presumed)
# log(1 + price) modelling scale -- TODO confirm.
sp.rf.pred <- predict(sp.rf,newdata=test)
sp.rf.table <- data.frame(Id=test.id,SalePrice=exp(sp.rf.pred)-1)
# BUG FIX: the original printed head(sp.bag.table, 5) here (copy-paste from
# the bagging section), so the preview below shows the BAGGING predictions,
# not the random-forest ones. Re-knit to refresh the printed values.
head(sp.rf.table,5)
##        Id SalePrice
## 1461 1461  125441.2
## 1462 1462  155426.5
## 1463 1463  184248.1
## 1464 1464  182322.5
## 1465 1465  192991.7
write.csv(sp.rf.table,file='RandomForest Housing Price.csv',row.names = FALSE)
Boosting
library(gbm)
## Loaded gbm 2.1.5
# Gradient boosting with 10-fold CV: small learning rate (shrinkage = 0.01),
# deep interactions (depth 10), up to 1000 trees. NOTE(review): `n.tree`
# relies on R's partial argument matching for `n.trees`; it works, but the
# argument should be spelled out in full.
sp.gbm <- gbm(SalePrice~.,data=train,distribution='gaussian',shrinkage=0.01,n.tree=1000,interaction.depth=10,cv.folds=10)
summary(sp.gbm)

##                         var      rel.inf
## OverallQual     OverallQual 33.112984707
## Neighborhood   Neighborhood 16.263931578
## GrLivArea         GrLivArea 13.751506507
## TotalBsmtSF     TotalBsmtSF  3.472304640
## KitchenQual     KitchenQual  2.915461002
## ExterQual         ExterQual  2.832610552
## X1stFlrSF         X1stFlrSF  2.533205709
## GarageCars       GarageCars  2.368344382
## BsmtFinSF1       BsmtFinSF1  2.222754203
## GarageArea       GarageArea  2.144671998
## MSSubClass       MSSubClass  1.884125259
## OverallCond     OverallCond  1.421649421
## MoSold               MoSold  1.214812103
## LotArea             LotArea  1.189171617
## YearRemodAdd   YearRemodAdd  1.136942584
## CentralAir       CentralAir  1.104814835
## SaleCondition SaleCondition  0.864849303
## FireplaceQu     FireplaceQu  0.749052240
## GarageType       GarageType  0.622461508
## Exterior1st     Exterior1st  0.600101140
## BsmtFinType1   BsmtFinType1  0.520584175
## Exterior2nd     Exterior2nd  0.501954032
## YearBuilt         YearBuilt  0.470268608
## X2ndFlrSF         X2ndFlrSF  0.462972747
## GarageCond       GarageCond  0.402289088
## GarageFinish   GarageFinish  0.377810787
## BsmtQual           BsmtQual  0.333529334
## BsmtExposure   BsmtExposure  0.327771896
## GarageYrBlt     GarageYrBlt  0.287855940
## MSZoning           MSZoning  0.269561427
## Functional       Functional  0.248397224
## OpenPorchSF     OpenPorchSF  0.244466212
## TotRmsAbvGrd   TotRmsAbvGrd  0.244010579
## FullBath           FullBath  0.219065074
## Condition1       Condition1  0.201544795
## LotFrontage     LotFrontage  0.181715545
## BsmtUnfSF         BsmtUnfSF  0.171245718
## WoodDeckSF       WoodDeckSF  0.158543730
## GarageQual       GarageQual  0.157368687
## Fireplaces       Fireplaces  0.149250165
## PavedDrive       PavedDrive  0.146599939
## ExterCond         ExterCond  0.137079142
## BsmtCond           BsmtCond  0.132023354
## YrSold               YrSold  0.128732966
## ScreenPorch     ScreenPorch  0.120760848
## HeatingQC         HeatingQC  0.118627921
## BsmtFullBath   BsmtFullBath  0.112982757
## SaleType           SaleType  0.097129153
## LandContour     LandContour  0.088556044
## LotConfig         LotConfig  0.076200250
## MasVnrArea       MasVnrArea  0.067747047
## Fence                 Fence  0.067597036
## EnclosedPorch EnclosedPorch  0.058857470
## BedroomAbvGr   BedroomAbvGr  0.043942814
## HalfBath           HalfBath  0.042158707
## RoofMatl           RoofMatl  0.036849322
## BsmtFinType2   BsmtFinType2  0.033596645
## Foundation       Foundation  0.028504070
## Electrical       Electrical  0.028106998
## RoofStyle         RoofStyle  0.014640387
## BsmtFinSF2       BsmtFinSF2  0.010801418
## KitchenAbvGr   KitchenAbvGr  0.010034896
## Alley                 Alley  0.009943481
## LotShape           LotShape  0.009815651
## HouseStyle       HouseStyle  0.009354410
## MasVnrType       MasVnrType  0.009061307
## X3SsnPorch       X3SsnPorch  0.006215119
## LowQualFinSF   LowQualFinSF  0.005383488
## LandSlope         LandSlope  0.004459384
## BsmtHalfBath   BsmtHalfBath  0.002384267
## MiscVal             MiscVal  0.002240571
## BldgType           BldgType  0.001495228
## Heating             Heating  0.001111440
## MiscFeature     MiscFeature  0.001045421
## Street               Street  0.000000000
## Condition2       Condition2  0.000000000
## PoolArea           PoolArea  0.000000000
## PoolQC               PoolQC  0.000000000

From the summary table above we can conclude that OverallQual is the most important predictor, since its relative influence is about 33.1, which is larger than that of any other predictor.

# In-sample predictions; when n.trees is not supplied, gbm's predict() picks
# a number of trees itself (the message below reports which it used).
train.gbm <- predict(sp.gbm,train)
## Using 880 trees...
mse.gbm <- mean((train.gbm-train$SalePrice)^2)
mse.gbm
## [1] 0.004839912
# Kaggle submission, explicitly using the CV-optimal number of trees.
# exp(.) - 1 back-transforms from the (presumed) log(1 + price) scale
# -- TODO confirm.
sp.gbm.yhat <- predict(sp.gbm,newdata=test,n.trees=which.min(sp.gbm$cv.error))
sp.gbm.table <- data.frame(Id=test.id,SalePrice=exp(sp.gbm.yhat)-1)
head(sp.gbm.table,5)
##     Id SalePrice
## 1 1461  122068.0
## 2 1462  159122.3
## 3 1463  183513.2
## 4 1464  186705.9
## 5 1465  193232.6
write.csv(sp.gbm.table,file='Boosting Housing Price.csv',row.names = FALSE)
KNN
# NOTE(review): class::knn is a CLASSIFIER. SalePrice is continuous, so every
# distinct price effectively becomes its own class label; `error.k` below is
# a misclassification rate (~0.99) and not a meaningful regression error.
# A k-NN REGRESSION (averaging neighbour prices) would be the right tool here.
set.seed(123)
x <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
                    YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+
                    HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train)[, -1]
# Assumes SalePrice is the LAST column of train -- TODO confirm; selecting it
# by name (train$SalePrice) would be safer.
y <- train[, length(train)]
# Random assignment of the 10 CV folds.
fold.index <- cut(sample(1:nrow(train)), breaks=10, labels=FALSE) 
k.value <- c(1,50,100)
error.k <- rep(0, length(k.value))
counter <- 0
for(k in k.value){
  counter <- counter + 1
  error <- 0
  for(i in 1:10){
    pred.knn <- knn(x[fold.index!=i,], x[fold.index==i,], y[fold.index!=i], k=k)
    error <- error + sum(pred.knn != y[fold.index==i]) 
  }
  error.k[counter] <- error/nrow(train) 
}
print(error.k)
## [1] 0.9958904 0.9958904 0.9904110
set.seed(123)
x.train <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
                        YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+
                          HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch,train)[,-1]
x.test <- model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+ YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
                          X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+ HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
                        test)[,-1]
# NOTE(review): the final model uses k = 10, which was never among the
# candidate k values (1, 50, 100) evaluated above -- TODO confirm intent.
pred.knn <- knn(x.train, x.test, train$SalePrice, k=10) 
# knn() returns a factor of "class" labels (price strings); convert back to
# numeric, then undo the (presumed) log(1 + price) transform.
pred.knn <- exp(as.numeric(as.character(pred.knn)))-1 
test.knn <- data.frame(Id = test.id, SalePrice = pred.knn) 
head(test.knn, 5)
##     Id SalePrice
## 1 1461    302000
## 2 1462     97000
## 3 1463    172500
## 4 1464    192000
## 5 1465    173000
write.csv(test.knn,file='KNN Method Housing Price.csv',row.names = FALSE)

Conclusion

Based on training mse

After all the analysis we have done, we can see their training mse to predict what might be the best model for this dataset.

# Error estimates collected from each section above, for side-by-side
# comparison. Note these are a mix of training and hold-out/CV estimates,
# so they are not all on an equal footing.
mse.lm
## [1] 0.01959431
mse.best
## [1] 0.01955967
mse.fwd
## [1] 0.01955967
mse.bwd
## [1] 0.01955967
mse.lasso
## [1] 0.01959865
mse.ridge
## [1] 0.01770622
mse.gam
## [1] 0.01229514
mse.tree
## [1] 0.04031398
mse.bag
## [1] 0.003282359
mse.random
## [1] 0.003662686
mse.gbm
## [1] 0.004839912
# NOTE(review): 1 - error.k is a classification "accuracy" from the KNN
# section, not an MSE, so it is not directly comparable to the values above.
1-error.k
## [1] 0.004109589 0.004109589 0.009589041

From the data above we can conclude that, based on training MSE, we might want to use the bagging method on the testing dataset for the best SalePrice prediction. However, we should also compare the testing MSE (the true error on the testing dataset) to see which method actually performs best.

Based on True error

From the true error we can clearly see that the boosting method is the best method for this dataset, because it has the lowest true error and a relatively low training error among all the methods.